Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/refactor for reduced complexity #11

Closed
wants to merge 255 commits into from
Closed

Feature/refactor for reduced complexity #11

wants to merge 255 commits into from

Conversation

tutrie
Copy link
Contributor

@tutrie tutrie commented Aug 9, 2021

No description provided.

@irinaespejo irinaespejo mentioned this pull request Aug 9, 2021
2 tasks
Comment on lines 1 to 47
import numpy as np
import itertools
from .. import utils
import torch

torch.cuda.set_device(0)


def classlabels(values, thresholds):
labels = np.zeros(values.shape)
for j in range(len(thresholds) - 1):
print("[{}, {}]".format(thresholds[j], thresholds[j + 1]))
within = np.logical_and(thresholds[j] < values, values < thresholds[j + 1])
labels[within] = j + 1
return labels


def confusion_matrix(gps, scandetails):
thresholds = np.concatenate([[-np.inf], scandetails.thresholds, [np.inf]])
diagX = utils.mesh2points(
utils.mgrid(scandetails.rangedef), scandetails.rangedef[:, 2]
)

diagX = diagX[~scandetails.invalid_region(diagX)]

labels = list(range(1, len(thresholds)))

confusion_list, predlabels_list, truelabels_list = [], [], []
for i, gp in enumerate(gps):
predy = gps[i].predict(diagX)
truthy = scandetails.truth_functions[i](diagX)
predlabels = classlabels(predy, thresholds)
truelabels = classlabels(truthy, thresholds)

confusion_matrix = np.zeros((len(labels), len(labels)))
for pred, true in itertools.product(labels, labels):
print("pred {}/true {}".format(pred, true))
predlabels_when_true = predlabels[truelabels == true]
numerator = np.sum(np.where(predlabels_when_true == pred, 1, 0))
denominator = len(predlabels_when_true)
print("{}/{}".format(numerator, denominator))
confusion_matrix[true - 1, pred - 1] = numerator / denominator

predlabels_list.append(predlabels)
truelabels_list.append(truelabels)
confusion_list.append(confusion_matrix.tolist())
return confusion_list, predlabels_list, truelabels_list, diagX, labels

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unsure why the folder is called _diagnosis/ what does the underscore mean here?

from .. import utils
import torch

torch.cuda.set_device(0)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry this line is legacy from my code, suggestion: delete

Comment on lines 1 to 56
import numpy as np
import logging

log = logging.getLogger(__name__)

def get_class_labels(y,class_labels,minmaxes):
assigned_labels = -np.ones_like(y)
for class_idx in class_labels:
class_selector = np.logical_and(y>=minmaxes[class_idx][0], y<minmaxes[class_idx][1])
assigned_labels[class_selector] = class_idx
assert np.min(assigned_labels) >= 0
return assigned_labels

def get_status(testy,esti_y,class_labels,minmaxes):
true_labels = get_class_labels(testy,class_labels,minmaxes)
esti_labels = get_class_labels(esti_y,class_labels,minmaxes)
status = np.concatenate([esti_labels.reshape(-1,1),true_labels.reshape(-1,1)], axis=-1)
return status

def get_confusion_matrices(statuses,class_labels):
confusion_matrix = -np.ones((len(statuses),2,2))
for sidx,sta in enumerate(statuses):
for i in class_labels:
for j in class_labels:
test_val = i
truth_val = j
condition_on_truth = sta[sta[:,1]==truth_val]
is_test_condition_truth = condition_on_truth[condition_on_truth[:,0]==test_val]
prob = len(is_test_condition_truth)/len(condition_on_truth)
confusion_matrix[sidx][i][j] = prob #p(i|j)
return confusion_matrix


def confusion_callback(X,y_list,gps,scandetails):
testX, testy_list = scandetails.testdata()
log.debug('computing confusion matrix based on {} testing points'.format(len(testX)))
estimatey_list = [gps[i].predict(testX) for i in range(len(scandetails.functions))]

class_labels = np.arange(len(scandetails.thresholds)+1)

boundaries = [-np.inf] + scandetails.thresholds + [np.inf]
minmaxes = [[boundaries[i],boundaries[i+1]] for i in class_labels]
statuses = [
get_status(yl,estimatey_list[i],class_labels,minmaxes)
for i,yl in enumerate(testy_list)
]
matrices = get_confusion_matrices(statuses,class_labels)
topline = np.mean([np.mean(np.diag(cm)) for cm in matrices])
return {'t': topline, 'matrices': np.asarray(matrices).tolist()}

def diagnose(X,y_list,gps, scandetails, callbacks = None):
callbacks = callbacks or {
'confusion': confusion_callback,
'npoints': lambda X,*args: len(X),
}
return {name: c(X,y_list,gps,scandetails) for name,c in callbacks.items()}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this is legacy code from my repo. Let's remove it so that the PR is cleaner

Comment on lines 1 to 57
import numpy as np
import itertools

from .. import utils


def regular_grid_generator(
scandetails, central_range=[5, 20], nsamples_per_grid=15, min_points_per_dim=2
):
ndim = len(scandetails.rangedef[:, 2])
grids = set(
x
for y in range(*central_range)
for x in itertools.combinations_with_replacement([y, y - 1, y - 2], ndim)
)
grids = [g for g in grids if np.all(np.array(g) >= min_points_per_dim)]
grids = sorted(grids, key=lambda k: np.product(k))

def makegrids(sizes):
for s in range(nsamples_per_grid):
los = np.random.uniform(0, 0.1, size=ndim)
his = np.random.uniform(0.9, 1.0, size=ndim)
yield np.array(
np.meshgrid(
*[np.linspace(los[i], his[i], sizes[i]) for i in range(ndim)]
)
)

for g in grids:
for mesh in makegrids(g):
X = utils.mesh2points(mesh, mesh.shape[1:])
for i in range(ndim):
vmin, vmax = (
scandetails.rangedef[i][0],
scandetails.rangedef[i][1],
)
X[:, i] = X[:, i] * (vmax - vmin) + vmin
X = X[~scandetails.invalid_region(X)]
yield X, g


def latin_hypercube_generator(
scandetails, nsamples_per_npoints=50, point_range=[4, 100]
):
ndim = len(scandetails.rangedef[:, 2])
import pyDOE

for npoints in range(*point_range):
for s in range(nsamples_per_npoints):
sample_n = npoints
while True:
print("sampling", sample_n)
X = pyDOE.lhs(ndim, samples=sample_n)

for i in range(ndim):
vmin, vmax = (
scandetails.rangedef[i][0],

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is handy to preserve. I would rename it to samplers.py instead of old_samplers.py so that we know it is not legacy but actual useful code. Not sure if it should be under _diagnosis but fine for now.

Copy link

@irinaespejo irinaespejo left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More comments on core files of the refactoring

Comment on lines 1 to 79
import numpy as np
import datetime
import torch
import gpytorch

from . import core

torch.cuda.set_device(0)


def gridsearch(gps, X, scandetails):
thresholds = [-np.inf] + scandetails.thresholds + [np.inf]
print("info_gain")
acqval = np.array(
[
core.info_gain(xtest, gps, thresholds, scandetails.meanX)
for xtest in scandetails.acqX
]
)
print(acqval)
newx = None
for i, cacq in enumerate(scandetails.acqX[np.argsort(acqval)]):
if cacq.tolist() not in X.tolist():
print("taking new x. best non-existent index {} {}".format(i, cacq))
newx = cacq
return newx, acqval
else:
print("{} is good but already there".format(cacq))
print("returning None.. something must be wrong")
return None, None


def gridsearch_gpytorch(gps, X, scandetails):
thresholds = [-np.inf] + scandetails.thresholds + [np.inf]
print("info_gain")
acqval = np.array(
[
core.info_gain_gpytorch(xtest, gps, thresholds, scandetails.meanX)
for xtest in scandetails.acqX
]
)

newx = None
for i, cacq in enumerate(scandetails.acqX[np.argsort(acqval)]):
if cacq.tolist() not in X.tolist():
print("taking new x. best non-existent index {} {}".format(i, cacq))
newx = cacq
return newx, acqval
else:
print("{} is good but already there".format(cacq))
print("returning None.. something must be wrong")
return None, None


def batched_gridsearch(gps, X, scandetails, gp_maker=None, batchsize=1):
newX = np.empty((0, X.shape[-1]))
my_gps = gps
my_y_list = [gp.y_train_ for gp in my_gps]
myX = X

acqinfos = []

while True:
newx, acqinfo = gridsearch(my_gps, myX, scandetails)
newX = np.concatenate([newX, np.asarray([newx])])
acqinfos.append(acqinfo)
if (len(newX)) == batchsize:
print("we got our batch")
return newX, acqinfos

print("do fake update")
myX = np.concatenate([myX, newX])

newy_list = [gp.predict(newX) for gp in my_gps]
for i, newy in enumerate(newy_list):
print("new y i: {} {}".format(i, newy))
my_y_list[i] = np.concatenate([my_y_list[i], newy])
print("build fake gps")
my_gps = [gp_maker(myX, my_y) for my_y in my_y_list]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the underscore in the folder. Does this mean there is an optimize/ folder somehwere else. If o, what is the difference?

@@ -0,0 +1,268 @@
import numpy as np

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as with _optimize/__init__.py, underscore?

entropy_grid[p_j > 0] -= torch.log(torch.exp(p_j[p_j > 0])) * torch.log(torch.exp(torch.log(p_j[p_j > 0])))

acq_cand_vals = entropy_grid
self.acq_vals = torch.clone(entropy_grid)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is torch.clone necessary? Can we do self.acq_vals = acq_cand_vals or am I missing something?

self.batch = batch
self.acq_vals = None

def _acquire(self, gp, thresholds, X_pointsgrid, x_candidate):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does class MES() have a method called acquire but PES has _acquire ? I think without underscore is fine

Comment on lines 1 to 15
def PPES(gp, testcase, thresholds, x_candidate):
"""
Calculates information gain of choosing x_candidadate as next point to evaluate.
Performs this calculation with the Predictive Entropy Search approximation weighted by the posterior.
Roughly,
PES(x_candidate) = int Y(x)dx { H[Y(x_candidate)] - E_{S(x=j)} H[Y(x_candidate)|S(x=j)] }
Notation: PES(x_candidate) = int dx H0 - E_Sj H1

"""

# compute predictive posterior of Y(x) | train data
raise NotImplmentedError(
"Should be same strcture as PES but the cumulative info gain is weighted"
)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove file altogether, sorry it's legacy

Comment on lines +58 to +81
def acquire(self, gp, thresholds, X_pointsgrid):
"""
Calculates information gain of choosing x_candidate as next point to evaluate.
Performs this calculation with the Predictive Entropy Search approximation. Roughly,
PES(x_candidate) = int dx { H[Y(x_candidate)] - E_{S(x=j)} H[Y(x_candidate)|S(x=j)] }
Notation: PES(x_candidate) = int dx H0 - E_Sj H1

"""

# compute predictive posterior of Y(x) | train data
# likelihood = gp.likelihood
# gp.eval()
# likelihood.eval()
acquisition_values = []

# for x_candidate in X_grid:
for x_candidate in X_pointsgrid:
x_candidate = x_candidate.view(1, -1).to(device=self.device, dtype=self.dtype)
acquisition_values.append(self._acquire(gp, thresholds, X_pointsgrid, x_candidate))

acq_cand_vals = torch.Tensor(acquisition_values).to(device=self.device, dtype=self.dtype)
self.acq_vals = torch.clone(acq_cand_vals)

return X_pointsgrid[self.get_first_max_index(gp, X_pointsgrid, acq_cand_vals)]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok I see the utility of _acquire here. Same comment with torch.clone. The code for PES looks good to me.

Comment on lines +10 to +28
def set_params(self, **params):
"""
Set the parameters of this acquisition function.
Parameters
----------
**params : dict
Function parameters.
Returns
-------
self : object
Function instance.
"""
if not params:
# Simple optimization to gain speed (inspect is slow)
return self
for key, value in params.items():
setattr(self, key, value)

return self

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use acquisition function with parameters, that would be a bit meta. Let's erase the method if you can't give a use case.

from collections import defaultdict


class AcquisitionFunction(object):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, why is there no init method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No init since it is an abstract interface and not an abstract object. this ensures all the future acq classes which implement it will be able to use the ask-and-tell API

Copy link

@irinaespejo irinaespejo Aug 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

end ✔️

self.init_n_points = init_n_points # future place to store init points that were a snapshot of past training
self.dtype = torch.float64
self._invalid_region = None

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.device = device ?? with device coming from

if cuda available:
device = torch.device('cuda')
else:
device = torch.device('cpu')
If you have another way of incorporating device lmk

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

device comes from the algorithm options. bit convoluted at the moment i think... also this was an oversight bc it should not always be torch.float64, unless it should?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's too much hassle let's leave the device and dtype like that but please remmeber to add it to the ongoing TO DO list in this issue #12

end ✔️

Comment on lines +8 to +40
class _Learner(object):
def __init__(self, problem_details: ExcursionProblem = None, algorithm_options: dict = None):
self.problem_details = problem_details
self.options = algorithm_options

def ask(self, npoints: int = None):
"""
Suggest a new point to evaluate.
Parameters
----------
npoints : int, default: None
Determine how many points to ask for batch acquisition.
Returns
-------
points : object
Some kind of array object e.g. torch.Tensor or numpy.ndarray
"""
raise NotImplementedError()

def tell(self, x, y):
raise NotImplementedError()

def evaluate(self, x):
raise NotImplementedError()

def evaluate_and_tell(self, x):
raise NotImplementedError()

def run(self, n_iterations, plot_result=False):
raise NotImplementedError()

def evaluate_metrics(self):
raise NotImplementedError()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed _Learner in a meeting and agreed to remove? @lukasheinrich @tutrie

Comment on lines 1 to 6
[submodule "DileptonReinterpretationProj"]
path = DileptonReinterpretationProj
url = https://gitlab.cern.ch/prieck/DileptonReinterpretationProj.git
[submodule "excursion"]
path = excursion
url = https://github.com/diana-hep/excursion.git

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Everything under the folder 'excursion/madgraph5atlasval/' should get removed, legacy 🔥 📄

Comment on lines 1 to 37
#
# ALGORITHM OPTS
#

example: '1Dtoyanalysis'

#number of initial points
ninit: 1

#number of iterations
nupdates: 1

##Define the init training data type
init_type: 'random'

#if init_type == 'custom' then fill in the lines below
#X_train:
# [0.,0.]

#likelihood
likelihood:
type: 'GaussianLikelihood'
epsilon: 0.

#model
model:
type: 'GridGP'
kernel: 'RBF'
prior: 'Constant'


#acquisition function
acq:
acq_type: 'MES'
batch: False
batchtype: None
batchsize: None

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok the testing/ folder is legacy but should stay to make unit tests with pytest. I'm gonna put the refactoring of the testing/ folder to adapt it to your code in a TO DO list. No action to take for now.

Comment on lines 1 to 71
import json
import numpy as np
import pkg_resources
import pickle
from excursion.utils import mgrid, mesh2points
import torch
import sklearn.preprocessing


datafile = pkg_resources.resource_filename(
"excursion", "unfactored/data/checkmate_dense.json"
)

def modify(zv):
return np.log(zv) - np.log(0.05)


truthX, truthy_obs, truthy_exp = [], [], []
for p, _, result in json.load(open(datafile))["precomputed"]:
if p[0] < p[1] + 200:
continue
truthX.append(p)
truthy_obs.append(
max(float(result[1]["observed_CLs"]), 0.001) if result[1] else 0.001
)
truthy_exp.append(
max(float(result[1]["expected_CLs"]), 0.001) if result[1] else 0.001
)

truthX = np.array(truthX)
truthy_obs = np.array(truthy_obs)
truthy_obs = modify(truthy_obs)

truthy_exp = np.array(truthy_exp)
truthy_exp = modify(truthy_exp)


scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(truthX)
truthX = scaler.transform(truthX)

picklefile = pkg_resources.resource_filename(
"excursion", "unfactored/data/checkmate.pkl"
)
d = pickle.load(open(picklefile, "rb"))

print('d', type(d))
print(d)


def truth_obs(X):
from scipy.interpolate import griddata
grid_data = torch.from_numpy(griddata(truthX,truthy_obs,X))
return grid_data




def invalid_region(x):
oX = scaler.inverse_transform(x)
return oX[:, 0] < oX[:, 1] + 202


thresholds = torch.Tensor([0.05])
true_functions = [truth_obs]
n_dims = 2

rangedef = np.array([[0.0, 1.0, 101], [0.0, 1.0, 101]])
plot_meshgrid = mgrid(rangedef)
X_plot = mesh2points(plot_meshgrid, rangedef[:, 2])
X = torch.from_numpy(X_plot)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the unfactored/ folder get 🔥 ? I think so, we have completely changed the testcase import module in favor of ExcrusionProblem class

Comment on lines +15 to +16
Set the parameters of this initial point generator.
Parameters

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A Gaussian Process has hyperparameters and GPyTorch takes control of that. I'm unsure what parameters set_params refers to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is responsible for setting the initial parameters in the builders. it simplifies constructing objects for use with the ask and tell api. they dont have to change the learner or the optimizer. just make a new acquisition function and model and can do custom stuff with very little work. I can create an example, with botorch maybe?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you could create a very simple use case for this, that'd be helpful. Thank you!
Before creating an example, maybe the famous diagram connecting the classes and objects is enough for me to understand the utility.

Comment on lines 41 to 42
targets = targets + self.epsilon * MultivariateNormal(torch.zeros(len(targets)), torch.eye(len(targets)))\
.sample(torch.Size([])).to(device=self.device, dtype=self.dtype)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better to put NotImplementedError here.
The issue is subtle, when there is a noisy observation it means that querying the black box function ans receiving target_observed = target_true + epsilon. Epsilon is a random variable and everytime it is queried like in MultivariateNormal(torch.zeros(len(targets)), torch.eye(len(targets))) it will give a different error ⚠️
This can introduce bugs along the way because if we are querying target_observed anywhere else in the code they are going to be different quantities ❗

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

discussed on Slack, y(x) is only called once. no action to take

end ✔️

Comment on lines 5 to 41
class _Optimizer(object):
def __init__(self, details: ExcursionProblem, device: str, n_funcs: int = None,
base_estimator: str or list or ExcursionModel = "ExactGP", n_initial_points=None,
initial_point_generator="random", acq_func: str = "MES", fit_optimizer=None,
base_estimator_kwargs=None, fit_optimizer_kwargs=None, acq_func_kwargs=None, jump_start: bool = True):
raise NotImplementedError()

def ask(self, n_points=None, batch_kwarg={}):
raise NotImplementedError()

def _ask(self):
raise NotImplementedError()

def tell(self, x, y, fit=True):
raise NotImplementedError()

def _tell(self, x, y, fit=True):
"""Perform the actual work of incorporating one or more new points.
See `tell()` for the full description.
This method exists to give access to the internals of adding points
by side stepping all input validation and transformation."""
raise NotImplementedError()

def update_next(self):
"""Updates the value returned by opt.ask(). Does so by calling the acquisition func. Useful if a parameter
was updated after ask was called."""
raise NotImplementedError()

def fit(self):
"""Calls the model's fit method. Give user access to this command as well.
"""
raise NotImplementedError()

def get_result(self):
"""Returns the most recent result as a new object. self.result stores the log if log = true.
"""
raise NotImplementedError()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as with _Learner, do we need a private class? @lukasheinrich @tutrie

@@ -0,0 +1,49 @@
#

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

either we use a folder called test_suite/ or testing/ the name testing/ has the advantage that pytest automatically detects this is the folder to use for unit tests.

@@ -0,0 +1,518 @@
from .builders import build_sampler, build_model, build_acquisition_func

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why have two folders _optimize/ and optimize/ ?

if acq_function == "mes":
acq_function = MES(device=kwargs['device'], dtype=kwargs['dtype'])
if acq_function == "pes":
acq_function = PES()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PES() should, at least, get the same arguments as MES: device=kwargs['device'], dtype=kwargs['dtype']

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it does, set_params takes care of it. I will change it was an oversight

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool

end ✔️

Copy link

@irinaespejo irinaespejo left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Last comments

made acquisition builder signature for MES and PES the same by removing dtype and device from MES letting it be set by set_params instead
discovered bug with shared memory in the result object for the list objects. self.result.update updates lists of result objects for dif problems
we use the excursionresult to store data don't need two copies
@tutrie tutrie closed this Aug 16, 2021
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants